1. Parameter Estimation
real_gene_count <- read.table("/Users/ziyiou/CUHKSZ/23 Fall/Genomics/longitudinal analysis/ISLET simulation/GSE60424_GEOSubmit_FC1to11_normalized_counts.txt", row.names = 1)
data <- getGEO("GSE60424")
gset <- data[[1]]
real_pdata <- pData(gset)
colnames(real_gene_count) <- real_pdata$geo_accession
# 指定 Ensembl 物种和版本
ensembl <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# 获取 Ensembl ID 和基因名称的映射
annotation <- biomaRt::getBM(attributes = c("ensembl_gene_id", "external_gene_name"), mart = ensembl)
# 使用映射将 Ensembl ID 转换为基因名称
gene_names <- annotation$external_gene_name[match(rownames(real_gene_count), annotation$ensembl_gene_id)] ## 存在NA和""
# 查找 Ensembl ID 中不存在于数据库中的项
missing_ids <- setdiff(rownames(real_gene_count), annotation$ensembl_gene_id)
# 打印不存在的 Ensembl ID
cat("Number of ensemble ID not exists in biomaRt database:", length(missing_ids))
# 更新 real_gene_count 和 gene_names
real_gene_count <- real_gene_count[!is.na(gene_names) & gene_names != "", ]
gene_names <- gene_names[!is.na(gene_names) & gene_names != ""]
## 存在一个gene id对应多个ensemble id的情况,此处直接删除
duplicate_indices <- duplicated(gene_names)
duplicated_genes <- gene_names[duplicate_indices]
print(unique(duplicated_genes))
real_gene_count <- real_gene_count[!duplicate_indices, ]
gene_names <- gene_names[!duplicate_indices]
rownames(real_gene_count) <- gene_names
cat("Number of genes in bulk reference matrix after mapping ensemble id to gene names:", nrow(real_gene_count))
saveRDS(real_gene_count, file = "real_gene_count.rds")
saveRDS(real_pdata, file = "real_pdata.rds")
################ meta data ################
real_gene_count = readRDS("real_gene_count.rds")
real_pdata = readRDS("real_pdata.rds")
real_meta_data <- data.frame(group = ifelse(real_pdata$`diseasestatus:ch1` == "Healthy Control", "ctrl",
"case"),
subject_ID = as.numeric(real_pdata$`donorid:ch1`),
age = as.numeric(real_pdata$`age:ch1`),
cellType = real_pdata$`celltype:ch1`)
rownames(real_meta_data) <- real_pdata$geo_accession
# 去除细胞类型为"Whole Blood"的样本
filtered_meta_data <- subset(real_meta_data, cellType != "Whole Blood")
filtered_gene_count <- real_gene_count[colnames(real_gene_count) %in% rownames(filtered_meta_data), ]
# 确保行名的顺序匹配
filtered_gene_count <- filtered_gene_count[, rownames(filtered_meta_data)]
ctrl_meta_data <- subset(filtered_meta_data, group == "ctrl")
case_meta_data <- subset(filtered_meta_data, group == "case")
ctrl_gene_count <- filtered_gene_count[, rownames(ctrl_meta_data)]
case_gene_count <- filtered_gene_count[, rownames(case_meta_data)]
ctrl_gene_count <- ctrl_gene_count
################
We use DESeq2 to do the parameter estimation.
We get the mean expression of gene and perform log translation to it,
save it as mu_m.rds.
In the reference panel, we assume the mean expression of gene \(g\) for subject \(j\) at time follows \(N(\mu _m, \sigma_m^2)\), where \(\sigma_m\) is estimated by
estimateDispersions function in DESeq2. We
save it as var_m.rds.
dds <- DESeqDataSetFromMatrix(countData = ctrl_gene_count,
colData = ctrl_meta_data,
design = ~ 1)
dds
## class: DESeqDataSet
## dim: 29920 24
## metadata(1): version
## assays(1): counts
## rownames(29920): FGR CFH ... LINC00662 IGHV3OR16-15
## rowData names(0):
## colnames(24): GSM1479438 GSM1479439 ... GSM1479524 GSM1479525
## colData names(4): group subject_ID age cellType
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 96 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## log2 fold change (MLE): Intercept
## Wald test p-value: Intercept
## DataFrame with 29920 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## FGR 857.34899 9.74374 0.443362 21.97692 4.78836e-107
## CFH 2.73618 1.45216 0.503580 2.88368 3.93055e-03
## FUCA2 17.96582 4.16718 0.312560 13.33243 1.49937e-40
## GCLC 20.26188 4.34070 0.189674 22.88500 6.55419e-116
## NFYA 82.97725 6.37464 0.164013 38.86666 0.00000e+00
## ... ... ... ... ... ...
## CYP3A54P 0.000000 NA NA NA NA
## LINC02164 0.000000 NA NA NA NA
## TUBB8P7 0.127477 -2.971695 1.658891 -1.791375 0.0732332
## LINC00662 0.879693 -0.184927 0.347799 -0.531708 0.5949283
## IGHV3OR16-15 0.000000 NA NA NA NA
## padj
## <numeric>
## FGR 1.38268e-106
## CFH 6.03569e-03
## FUCA2 3.13892e-40
## GCLC 1.96268e-115
## NFYA 0.00000e+00
## ... ...
## CYP3A54P NA
## LINC02164 NA
## TUBB8P7 0.0841442
## LINC00662 0.6180206
## IGHV3OR16-15 NA
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates

mu_m <- data.frame(res$baseMean)
#mu_m <- log(mu_m)
rownames(mu_m) <- rownames(ctrl_gene_count)
dispersion_m <- data.frame(dispersions(dds))
rownames(dispersion_m) = rownames(dispersion_m)
# 标记 mu_m 中包含 -Inf 的行
non_inf_rows <- rowSums(mu_m == -Inf) == 0
# 标记 var_m 中包含 NA 的行
non_na_rows <- rowSums(is.na(dispersion_m)) == 0
# 取交集:同时满足 non_inf_rows 和 non_na_rows 的行
valid_rows <- non_inf_rows & non_na_rows
gene <- rownames(mu_m)[valid_rows]
# 筛选出 mu_m 和 var_m 中保留的行
mu_m_filtered <- mu_m[valid_rows, ]
dispersion_m_filtered <- dispersion_m[valid_rows, ]
mu_m <- data.frame(mu_m_filtered)
rownames(mu_m) = gene
mu_m <- head(mu_m, 5000)
dispersion_m <- data.frame(dispersion_m_filtered)
rownames(dispersion_m) = gene
dispersion_m <- head(dispersion_m, 5000)
var_m = 1/mu_m + dispersion_m
In DESeq2, variance \(\sigma^2
= \mu + \alpha\mu^2\), here \(\mu\) is mean and \(\alpha\) is the dispersion. Since in the
following simulation we simulate count in log scale, and by Taylor
Expansion we know that \(log(X) ≈
log(\mathbb{E}[X])+ \mathbb{E}[X] (X−\mathbb{E}[X])\), therefore
\(\mathrm{Var}[log(X)] = \displaystyle
\frac{\mathrm{Var}[X]}{\mathbb{E}^2[X]}\)
saveRDS(log(mu_m), file="mu_m.rds")
saveRDS(dispersion_m, file = "dispersion_m.rds")
saveRDS(var_m, file = "var_m.rds")
mu_m = readRDS("mu_m.rds")
mu_m = mu_m$mu_m_filtered
sigma_m = readRDS("var_m.rds")
cellType_list = c("B-cells", "CD4", "CD8", "NK", "Neutrophils", "Monocytes")
gene_list = rownames(sigma_m)
2. Temporal cell type proportions
We assume the proportion \(\theta_{jt}\) of each subject \(j\) for \(K\) cell types at time \(t\) follows Dirichlet distribution with
parameter \(\alpha\), where \(\alpha\) and \(\theta_{jt}\) are both \(K \times 1\) vectors, \(\mathbf{1}^T\theta_{jt} = 1\).
################# Notation Setup #################
#### input ####
# J: number of subject (must be an even number)
# T: number of time point (must be an even number)
# alpha_ls: cell population composition parameters for Dirichlet dist. (list(alpha_c, alpha_d))
# cell_ls: list of cell types
#### output ####
# theta: cell type proportion
#### This `Gen_prop` function is for generating cell type proportion
Gen_prop <- function(J, T, alpha_ls, cell_ls) {
set.seed(123)
alpha_c = alpha_ls[["alpha_c"]] # Dirichilet distribution params for control group
alpha_d = alpha_ls[["alpha_d"]] # Dirichilet distribution params for case group
ctrl_theta <- rdirichlet(n=J*T/2, alpha = alpha_c)
case_theta <- rdirichlet(n=J*T/2, alpha = alpha_d)
colnames(ctrl_theta) <- cell_ls
colnames(case_theta) <- cell_ls
rownames(case_theta) <- seq(from = 1, to = J * T / 2) # in accordance with sample_ID in metadata
rownames(ctrl_theta) <- seq(from = J * T / 2 + 1, to = J * T)
theta <- rbind(case_theta, ctrl_theta) # case is on the top
return(theta = t(theta))
# transpose: let theta be a n*K matrix (n is the number of samples)
}
In the paper, author provided us with the parameter value for the
Dirichlet distribution:
\[\alpha_c = (8.85, 6.49, 5.98, 5.28,
4.22, 3.85)\]
\[\alpha_d = (1.90, 2.25, 2.10, 5.72,
7.33, 15.37)\]
ctrl_alpha <- c(8.85, 6.49, 5.98, 5.28, 4.22, 3.85) # ctrl group
case_alpha <- c(1.90, 2.25, 2.10, 5.72, 7.33, 15.37) # case group
alpha_list = list(alpha_c = ctrl_alpha, alpha_d = case_alpha)
theta = Gen_prop(J=50, T=5, alpha_ls = alpha_list, cell_ls = cellType_list)
theta[, 1:5]
## 1 2 3 4 5
## B-cells 0.02230079 0.01501410 0.01314183 0.04691058 0.04692466
## CD4 0.06623229 0.08058458 0.06463024 0.12883512 0.12880281
## CD8 0.07401944 0.05478895 0.14144167 0.03732913 0.10845406
## NK 0.14745136 0.11669351 0.20039315 0.18620546 0.17347204
## Neutrophils 0.30466902 0.20367899 0.21962805 0.28593910 0.24653747
## Monocytes 0.38532711 0.52923987 0.36076506 0.31478060 0.29580895
3. Individual reference panel and temporal gene expression


For gene \(g\), the mean expression
of cell type \(k\) per subject \(j\) at time \(t\) is \(M_{gjkt}
\sim N(\mu^{t_j}_k, \sigma_g^2)\) (\(\sigma_g\) is the overdispersion of each
gene estimated by DESeq2).
The true reference matrix is \(\lambda_{gjkt} \sim
\Gamma(\mathrm{exp}(-\Phi_{gjk}),
M_{gjkt}\mathrm{exp}(\Phi_{gjk}))\), where \(M_{gjkt}\) and \(\Phi_{gjk}\) are components in vector \(\mathbf{M_{gjt}}, \mathbf{\Phi_{gj}}\), and
\(\mathrm{exp}(-\Phi_{gjk})\) is shape,
\(M_{gjkt}\mathrm{exp}(\Phi_{gjk})\) is
scale. (\(\Phi_{gjk} \sim N(-3,
1)\)).
Note that \(M_{gjkt}\) and \(\Phi_{gjk}\) are both in
log scale.
(1) Main Function
control_matrix <- function(cell_ls, gene_ls, DEG_ls, mu_jt, delta_jt) {
G = length(gene_ls) # number of genes
K = length(cell_ls)
control <- matrix(0, nrow = G, ncol = K)
colnames(control) = cell_ls
rownames(control) = gene_ls
for (k in 1:K) {
deg_indices = DEG_ls[[cell_ls[k]]]$Type3
control[deg_indices, ] = delta_jt # k*delta_jt
}
control = control + mu_jt
return(control)
}
case_matrix <- function(cell_ls, gene_ls, DEG_ls, mu_jt, delta_jt) {
G = length(gene_ls) # number of genes
K = length(cell_ls)
case <- matrix(0, nrow = G, ncol = K)
colnames(case) = cell_ls
rownames(case) = gene_ls
for (k in 1:K) {
deg_indices = DEG_ls[[cell_ls[k]]]$Type3
case[deg_indices, ] = delta_jt # k*delta_jt
case[deg_indices, k] = 0
}
case = case + mu_jt
return(case)
}
################# Notation Setup #################
#### input ####
# J: number of subject (must be an even number)
# T: number of time point (must be an even number)
# mu_m
# sigma_m
# gene_ls: list of gene names
# cell_ls: list of cell types
# Delta_0 (LFC): age-independent group effect (intercept)
# k_mu: slope in baseline group
# k_delta: age-dependent group effect (slope)
#### output ####
# M: mean expression of gene
# Phi: overdispersion
# DEG_ls: list of differentially expressed genes (list(Type1, Type2, Type3))
# lambda: true reference expression
# meta_data
#### This `Gen_ref` function is for generating longitudinal individual reference panel
Gen_ref <- function(J, T, mu_m, sigma_m,
gene_ls, cell_ls, LFC=1.5, k_mu = 0.01, k_delta = 0.1) {
set.seed(123)
### meta data
meta_data <- data.frame(sample_ID = 1:(J*T),
group = rep(c("case", "ctrl"), each = T*J/2),
subject_ID = rep(1:J, each=T),
time_point = rep(1:T, J))
### marker gene index generation
G = length(gene_ls) # number of genes
K = length(cell_ls)
sample_size <- ceiling(0.05 * G) # 5% DEG of each cell type
DEG_ls <- setNames(vector("list", length(cell_ls)), cell_ls)
for (k in 1:K) {
DEG_ls[[cell_ls[k]]] = list(
Type3 = seq(((k-1)*sample_size)+1, (k*sample_size))) }
### reference panel generation
mu_m = matrix(rep(mu_m, times = K), nrow = G, ncol = K)
rownames(mu_m) = gene_ls
colnames(mu_m) = cell_ls
sigma_m = data.frame(rep(sigma_m, each=K))
rownames(sigma_m) = gene_ls
colnames(sigma_m) = cell_ls
### Generate M
M_ls = list()
shift = ceiling(T/2)
for (j in (1:(J %/% 2))){
mu_jt = mu_m
delta_jt = LFC
for (t in (1:T)) { # linear
#if (t %in% (1:shift)) {mu_m_case = mu_m_case + Delta3}
#else {mu_m_case = mu_m_case - Delta3}
mu_jt = mu_jt + k_mu*t
delta_jt = delta_jt + k_delta*t
mu_m_case = case_matrix(cell_ls = cell_ls, gene_ls = gene_ls, DEG_ls = DEG_ls, mu_jt = mu_jt, delta_jt = delta_jt)
M <- mapply(rnorm, n = 1, mean = mu_m_case, sd = sigma_m)
M <- matrix(M, nrow = nrow(mu_m_case), ncol = ncol(mu_m_case),
dimnames = dimnames(mu_m_case))
# current idx: (j-1)*T + t
M_ls[[as.character((j - 1) * T + t)]] <- M
}
}
for (j in (((J %/% 2)+1):J)){
mu_jt = mu_m
delta_jt = LFC
for (t in (1:T)) {
mu_jt = mu_jt + k_mu*t
delta_jt = delta_jt + k_delta*t
mu_m_ctrl = control_matrix(cell_ls = cell_ls, gene_ls = gene_ls, DEG_ls = DEG_ls, mu_jt = mu_jt, delta_jt = delta_jt)
M <- mapply(rnorm, n = 1, mean = mu_m_ctrl, sd = sigma_m)
M <- matrix(M, nrow = G, ncol = K,
dimnames = dimnames(mu_m_ctrl))
# current idx: (j-1)*T + t
M_ls[[as.character((j - 1) * T + t)]] <- M
}
}
## Generate Phi
Phi_ls = list()
for (j in 1:J) {
for (t in 1:T) {
# 生成 Phi 矩阵
Phi <- mapply(rnorm, n = G*K, mean = -3, sd = 1)
Phi <- matrix(Phi, nrow = G, ncol = K,
dimnames = dimnames(mu_m_ctrl))
Phi_ls[[as.character((j - 1) * T + t)]] <- Phi }
}
## Generate lambda
lambda_ls = list()
for (j in (1:J)){
for (t in (1:T)){
idx = (j - 1) * T + t
M = M_ls[[as.character(idx)]]
Phi = Phi_ls[[as.character(idx)]]
lambda <- matrix(nrow = nrow(M), ncol = ncol(M))
shape_matrix <- exp(-Phi)
rate_matrix <- 1 / (exp(M) * exp(Phi))
lambda <- mapply(function(shape, rate) {
if (shape <= 0 || rate <= 0) {
return(0)} else {
return(rgamma(n = 1, shape = shape, rate = rate))}},
as.vector(shape_matrix), as.vector(rate_matrix))
# reorganize lambda into a matrix
lambda <- matrix(lambda, nrow = nrow(M), ncol = ncol(M))
lambda[lambda > 20000] <- 20000
colnames(lambda) = colnames(M)
rownames(lambda) = rownames(M)
lambda_ls[[as.character(idx)]] <- lambda
}
}
return (list(DEG_ls = DEG_ls, M_ls = M_ls, Phi_ls = Phi_ls, lambda_ls=lambda_ls, meta_data = meta_data))
}
(2) Heatmap Function
create_heatmap_grob <- function(M, title, color_palette) {
pheatmap_obj <- pheatmap(M,
cluster_rows = FALSE,
cluster_cols = FALSE,
show_rownames = FALSE,
show_colnames = TRUE,
color = colorRampPalette(c("blue", "white", "red"))(100),
main = title,
silent = TRUE) # silent=TRUE 以避免直接打印
grid_plot <- pheatmap_obj$gtable
return(grid_plot)
}
5. \(\Delta_0\) = 0
ref = Gen_ref(J=50, T=5, mu_m = mu_m, sigma_m = sigma_m,
gene_ls = gene_list, cell_ls = cellType_list, LFC=0.0,
k_mu = 0.01, k_delta = 0.1)
meta_data <- ref$meta_data
lambda_list <- ref$lambda_ls
M_list <- ref$M_ls
Phi_list <- ref$Phi_ls
DEG_list <- ref$DEG_ls
head(meta_data)
## sample_ID group subject_ID time_point
## 1 1 case 1 1
## 2 2 case 1 2
## 3 3 case 1 3
## 4 4 case 1 4
## 5 5 case 1 5
## 6 6 case 2 1
Y <- Gen_mix(lambda_ls = lambda_list, theta = theta)
theta_df = as.data.frame(t(theta))
(1) Heatmap of \(M\) and \(\lambda\)
i. Mean expression \(M\)
heatmap_grobs <- lapply(c(1:5, 126:130), function(i) {
M <- M_list[[i]]
if (i %in% 1:5) {
create_heatmap_grob(M, paste("subject 1 (case) at time point ", i), color_palette) }
else {create_heatmap_grob(M, paste("subject 26 (ctrl) at time point ", i-125), color_palette)}
})
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 第i个热图
heatmap_grobs[[i + 5]], # 第i+5个热图
nrow = 1 # 一行两列
)
}





We examine whether DE genes we set are differentially expressed in
\(M\) by looking at the corresponding
index in case and control.
heatmap_grobs <- lapply(c(1:5), function(i) {
M_case <- M_list[[i]]
M_ctrl <- M_list[[i + 125]]
# 创建子集矩阵,只保留每列的250个元素
M_sub_case <- do.call(cbind, lapply(1:ncol(M_case), function(j) {
M_case[((j-1)*250 + 1):(j*250), j]
}))
M_sub_ctrl <- do.call(cbind, lapply(1:ncol(M_ctrl), function(j) {
M_ctrl[((j-1)*250 + 1):(j*250), j]
}))
# 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
M_combined <- cbind(M_sub_case, M_sub_ctrl)
# 创建热图 grob
create_heatmap_grob(M_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})
# 绘制热图
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 每个时间点的热图
nrow = 1 # 一行一列
)
}





ii. Ture reference panel \(\lambda\)
heatmap_grobs <- lapply(c(1:5), function(i) {
lambda_case <- lambda_list[[i]]
lambda_ctrl <- lambda_list[[i + 125]]
# 创建子集矩阵,只保留每列的250个元素
lambda_sub_case <- do.call(cbind, lapply(1:ncol(lambda_case), function(j) {
lambda_case[((j-1)*250 + 1):(j*250), j]
}))
lambda_sub_ctrl <- do.call(cbind, lapply(1:ncol(lambda_ctrl), function(j) {
lambda_ctrl[((j-1)*250 + 1):(j*250), j]
}))
# 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
lambda_combined <- cbind(lambda_sub_case, lambda_sub_ctrl)
# 创建热图 grob
create_heatmap_grob(lambda_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})
# 绘制热图
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 每个时间点的热图
nrow = 1 # 一行一列
)
}





(2) Deconvolve individual-specific reference panel
We deconvolve the individual-specific reference panel with
isletSolve() and compare the deconvolved reference panel
with true reference panel.
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
#age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrep(dat_se = N25_se)
N25_input
execution_time <- system.time({
N25_age.ref <- isletSolve(input=N25_input)
})
print(execution_time)
saveRDS(N25_age.ref, file="N25_ref_0_0.rds")
N25_age.ref <- readRDS("N25_ref_0_0.rds")
caseVal <- caseEst(N25_age.ref)
ctrlVal <- ctrlEst(N25_age.ref)
lambda_list_islet <- list()
for (i in 1:50) {
if (i %in% seq(1,25)) {
Bcell = caseVal[["B-cells"]][,i]
CD4 = caseVal[["CD4"]][,i]
CD8 = caseVal[["CD8"]][,i]
NK = caseVal[["NK"]][,i]
Neutrophils = caseVal[["Neutrophils"]][,i]
Monocytes = caseVal[["Monocytes"]][,i]
lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
colnames(lambda) = c("B-cells","CD4", "CD8", "NK", "Neutrophils", "Monocytes")
lambda_list_islet[[as.character(i)]] <- lambda
}
else {
Bcell = ctrlVal[["B-cells"]][,i-25]
CD4 = ctrlVal[["CD4"]][,i-25]
CD8 = ctrlVal[["CD8"]][,i-25]
NK = ctrlVal[["NK"]][,i-25]
Neutrophils = ctrlVal[["Neutrophils"]][,i-25]
Monocytes = ctrlVal[["Monocytes"]][,i-25]
lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
colnames(lambda) = c("B-cells","CD4", "CD8", "NK", "Neutrophils", "Monocytes")
lambda_list_islet[[as.character(i)]] <- lambda
}
}
true_ref_case <- lambda_list_islet[[1]][1:50, ]
islet_ref_case <- lambda_list[[1]][1:50, ]
ref_case_combined <- cbind(true_ref_case, islet_ref_case)
pheatmap(ref_case_combined,
cluster_rows = FALSE, # 关闭行聚类
cluster_cols = FALSE, # 关闭列聚类
show_rownames = FALSE, # 不显示行名
show_colnames = TRUE,
main = "True Reference Panel vs Deconvolved Reference Panel (Case)") # 显示列名

true_ref_ctrl <- lambda_list[[126]][1:50, ]
islet_ref_ctrl <- lambda_list_islet[[26]][1:50, ]
ref_ctrl_combined <- cbind(true_ref_ctrl, islet_ref_ctrl)
pheatmap(ref_ctrl_combined,
cluster_rows = FALSE, # 关闭行聚类
cluster_cols = FALSE, # 关闭列聚类
show_rownames = FALSE, # 不显示行名
show_colnames = TRUE,
main = "True Reference Panel vs Deconvolved Reference Panel (Ctrl)") # 显示列名

true_ref_case <- lambda_list[[1]]
islet_ref_case <- lambda_list_islet[[1]]
overall_correlation <- cor(c(true_ref_case), c(islet_ref_case))
print(paste("Correlation of case:", overall_correlation))
## [1] "Correlation of case: 0.214532247931833"
plot(c(log(true_ref_case+1)), c(log(islet_ref_case+1)), # +1 to avoid -Inf
main="ISLET estimated reference panel vs the true reference panel (Case)",
xlab="Log(True Reference)",
ylab="Log(Estimated Reference)",
pch=19,
col="blue")
abline(a=0, b=1, col="red", lwd=2)

true_ref_ctrl <- lambda_list[[126]]
islet_ref_ctrl <- lambda_list_islet[[26]]
overall_correlation <- cor(c(true_ref_ctrl), c(islet_ref_ctrl))
print(paste("Correlation of ctrl:", overall_correlation))
## [1] "Correlation of ctrl: 0.238309536979969"
plot(c(log(true_ref_ctrl+1)), c(log(islet_ref_ctrl+1)),
main="ISLET estimated reference panel vs the true reference panel (Ctrl)",
xlab="Log(True Reference)",
ylab="Log(Estimated Reference)",
pch=19,
col="blue")
abline(a=0, b=1, col="red", lwd=2)

(3) Test cell-type-specific differential expression (csDE)
genes
Slope test
Since \(\Delta_0 = 0\) in this
setting, we only conduct slope test.
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrepSlope(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
## First couple of elements from cases and controls:
## 1 2 3 4 5 6
## FGR 1642 4415 11255 617 13980 6415
## CFH 4 119 13 72 8 7
## FUCA2 58 20 2900 98 1914 143
## 126 127 128 129 130 131
## FGR 727 8200 7330 5902 8592 3099
## CFH 1 44 6 54 392 4
## FUCA2 34 81 378 117 158 182
## Design matrices hidded.
## Total cell type number:
## [1] 6
## Cell type categories:
## [1] "B-cells" "CD4" "CD8" "NK" "Neutrophils"
## [6] "Monocytes"
## Total sample number and subject number:
## [1] 250 50
## Total case number and ctrl number:
## [1] 25 25
## First several subject ID for the samples:
## [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope):
## [1] "slope"
execution_time <- system.time({
N25_age.test <- isletTest(input = N25_input)
})
print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_slope_test_0_0.rds")
We used BH procedure to control FDR in multiple testing, and then
reported csDEG at pValue < 0.05.
N25_age.test_0.0 <- readRDS("N25_slope_test_0_0.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_0.0))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")
csDEG <- p_values_long %>%
filter(PValue < 0.05)
top_csDEG <- csDEG %>%
group_by(CellType) %>%
arrange(CellType, PValue) %>%
ungroup()
gene_list_by_celltype <- top_csDEG %>%
split(.$CellType) %>%
lapply(function(df) df$Gene)
islet_gene_list_by_celltype <- list()
for (cell_type in names(gene_list_by_celltype)) {
factor_vector <- gene_list_by_celltype[[cell_type]]
gene_names <- as.character(factor_vector)
islet_gene_list_by_celltype[[cell_type]] <- gene_names
}
true_gene_list_by_celltype <- list()
for (cell_type in names(DEG_list)) {
#type1_indices <- DEG_list[[cell_type]]$Type1
#type3_indices <- DEG_list[[cell_type]]$Type3
all_indices <- DEG_list[[cell_type]]$Type3
genes <- gene_list[all_indices]
true_gene_list_by_celltype[[cell_type]] <- genes
}
num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type
accuracy_list_0.0 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_0.0)) {
predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
#type1_indices <- true_gene_list_by_celltype[[cell_type]]
type3_indices <- true_gene_list_by_celltype[[cell_type]]
# Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
accuracy_list_0.0[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
Type3_accuracy = Type3_accuracy)
}
print(accuracy_list_0.0)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.072
##
##
## $CD4
## $CD4$Type3_accuracy
## [1] 0.068
##
##
## $CD8
## $CD8$Type3_accuracy
## [1] 0.068
##
##
## $NK
## $NK$Type3_accuracy
## [1] 0.128
##
##
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.096
##
##
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.148
accuracy_list_0.0_slope <- accuracy_list_0.0
We use ROC curve to illustrate the result.
K = length(cellType_list)
G = length(gene_list)
flag = matrix(0, nrow = G, ncol = K)
rownames(flag) = gene_list
colnames(flag) = cellType_list
for (k in 1:K) {
type3_indices = DEG_list[[cellType_list[k]]]$Type3
flag[type3_indices, k] = 1
}
ROC.DE <- function(DE.gs, pval) {
pred = prediction(1-pval, DE.gs)
perf = performance(pred,"tpr","fpr")
perf
}
roc = ROC.DE(flag, N25_age.test_0.0)
# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")
# 初始化空数据框
roc_data <- data.frame()
# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
# 提取当前cellType的fpr和tpr
fpr <- fpr_list[[k]]
tpr <- tpr_list[[k]]
# 生成临时数据框并添加到总数据框中
temp_data <- data.frame(
FPR = fpr,
TPR = tpr,
CellType = cellType_list[k]
)
roc_data <- rbind(roc_data, temp_data)
}
# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
geom_line(size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
labs(title = "ROC Curve for DE Gene Detection (LFC = 0)",
x = "False Positive Rate (1-Specificity)",
y = "True Positive Rate (Sensitivity)") +
theme_minimal() +
theme(legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
plot.title = element_text(hjust = 0.5, size = 14)) +
scale_color_brewer(palette = "Set1")

6. \(\Delta_0\) = 0.5
ref = Gen_ref(J=50, T=5, mu_m = mu_m, sigma_m = sigma_m,
gene_ls = gene_list, cell_ls = cellType_list, LFC=0.5,
k_mu = 0.01, k_delta = 0.1)
meta_data <- ref$meta_data
lambda_list <- ref$lambda_ls
M_list <- ref$M_ls
Phi_list <- ref$Phi_ls
DEG_list <- ref$DEG_ls
head(meta_data)
## sample_ID group subject_ID time_point
## 1 1 case 1 1
## 2 2 case 1 2
## 3 3 case 1 3
## 4 4 case 1 4
## 5 5 case 1 5
## 6 6 case 2 1
Y <- Gen_mix(lambda_ls = lambda_list, theta = theta)
theta_df = as.data.frame(t(theta))
(1) Heatmap of \(M\) and \(\lambda\)
i. Mean expression \(M\)
heatmap_grobs <- lapply(c(1:5, 126:130), function(i) {
M <- M_list[[i]]
if (i %in% 1:5) {
create_heatmap_grob(M, paste("subject 1 (case) at time point ", i), color_palette) }
else {create_heatmap_grob(M, paste("subject 26 (ctrl) at time point ", i-125), color_palette)}
})
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 第i个热图
heatmap_grobs[[i + 5]], # 第i+5个热图
nrow = 1 # 一行两列
)
}





We examine whether DE genes we set are differentially expressed in
\(M\) by looking at the corresponding
index in case and control.
heatmap_grobs <- lapply(c(1:5), function(i) {
M_case <- M_list[[i]]
M_ctrl <- M_list[[i + 125]]
# 创建子集矩阵,只保留每列的250个元素
M_sub_case <- do.call(cbind, lapply(1:ncol(M_case), function(j) {
M_case[((j-1)*250 + 1):(j*250), j]
}))
M_sub_ctrl <- do.call(cbind, lapply(1:ncol(M_ctrl), function(j) {
M_ctrl[((j-1)*250 + 1):(j*250), j]
}))
# 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
M_combined <- cbind(M_sub_case, M_sub_ctrl)
# 创建热图 grob
create_heatmap_grob(M_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})
# 绘制热图
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 每个时间点的热图
nrow = 1 # 一行一列
)
}





ii. Ture reference panel \(\lambda\)
heatmap_grobs <- lapply(c(1:5), function(i) {
lambda_case <- lambda_list[[i]]
lambda_ctrl <- lambda_list[[i + 125]]
# 创建子集矩阵,只保留每列的250个元素
lambda_sub_case <- do.call(cbind, lapply(1:ncol(lambda_case), function(j) {
lambda_case[((j-1)*250 + 1):(j*250), j]
}))
lambda_sub_ctrl <- do.call(cbind, lapply(1:ncol(lambda_ctrl), function(j) {
lambda_ctrl[((j-1)*250 + 1):(j*250), j]
}))
# 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
lambda_combined <- cbind(lambda_sub_case, lambda_sub_ctrl)
# 创建热图 grob
create_heatmap_grob(lambda_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})
# 绘制热图
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 每个时间点的热图
nrow = 1 # 一行一列
)
}





isletSolve.test <-function(input, BPPARAM=bpparam() ){
# islet.solve only runs on the model without age effect.
if(input@type == 'slope'){
stop('Input should be prepared by dataPrep()')
}
#make Yall a list
G <- nrow(input@exp_case)
Yall<-as.matrix(cbind(input@exp_case, input@exp_ctrl))
aval.nworkers<-BPPARAM$workers
block.size<-max(ceiling(G/aval.nworkers), 5)
Yall.list <- split(as.data.frame(Yall), ceiling(seq_len(G)/block.size))
# if(.Platform$OS.type == "unix") {
## do some parallel computation under Unix
# multicoreParam <- MulticoreParam(workers = ncores)
res <- bplapply(X=Yall.list, islet.solve.block, datuse=input, BPPARAM=BPPARAM)
# }
return(res)
}
(2) Deconvolve individual-specific reference panel
We deconvolve the individual-specific reference panel with
isletSolve() and compare the deconvolved reference panel
with true reference panel.
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
#age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrep(dat_se = N25_se)
N25_input
execution_time <- system.time({
N25_age.ref <- isletSolve(input=N25_input)
})
print(execution_time)
saveRDS(N25_age.ref, file="N25_ref_0_5.rds")
N25_age.ref <- readRDS("N25_ref_0_5.rds")
caseVal <- caseEst(N25_age.ref)
ctrlVal <- ctrlEst(N25_age.ref)
lambda_list_islet <- list()
for (i in 1:50) {
if (i %in% seq(1,25)) {
Bcell = caseVal[["B-cells"]][,i]
CD4 = caseVal[["CD4"]][,i]
CD8 = caseVal[["CD8"]][,i]
NK = caseVal[["NK"]][,i]
Neutrophils = caseVal[["Neutrophils"]][,i]
Monocytes = caseVal[["Monocytes"]][,i]
lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
colnames(lambda) = c("B-cells","CD4", "CD8", "NK", "Neutrophils", "Monocytes")
lambda_list_islet[[as.character(i)]] <- lambda
}
else {
Bcell = ctrlVal[["B-cells"]][,i-25]
CD4 = ctrlVal[["CD4"]][,i-25]
CD8 = ctrlVal[["CD8"]][,i-25]
NK = ctrlVal[["NK"]][,i-25]
Neutrophils = ctrlVal[["Neutrophils"]][,i-25]
Monocytes = ctrlVal[["Monocytes"]][,i-25]
lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
colnames(lambda) = c("B-cells","CD4", "CD8", "NK", "Neutrophils", "Monocytes")
lambda_list_islet[[as.character(i)]] <- lambda
}
}
true_ref_case <- lambda_list_islet[[1]][1:50, ]
islet_ref_case <- lambda_list[[1]][1:50, ]
ref_case_combined <- cbind(true_ref_case, islet_ref_case)
pheatmap(ref_case_combined,
cluster_rows = FALSE, # 关闭行聚类
cluster_cols = FALSE, # 关闭列聚类
show_rownames = FALSE, # 不显示行名
show_colnames = TRUE,
main = "True Reference Panel vs Deconvolved Reference Panel (Case)") # 显示列名

true_ref_ctrl <- lambda_list[[126]][1:50, ]
islet_ref_ctrl <- lambda_list_islet[[26]][1:50, ]
ref_ctrl_combined <- cbind(true_ref_ctrl, islet_ref_ctrl)
pheatmap(ref_ctrl_combined,
cluster_rows = FALSE, # 关闭行聚类
cluster_cols = FALSE, # 关闭列聚类
show_rownames = FALSE, # 不显示行名
show_colnames = TRUE,
main = "True Reference Panel vs Deconvolved Reference Panel (Ctrl)") # 显示列名

true_ref_case <- lambda_list[[1]]
islet_ref_case <- lambda_list_islet[[1]]
overall_correlation <- cor(c(true_ref_case), c(islet_ref_case))
print(paste("Correlation of case:", overall_correlation))
## [1] "Correlation of case: 0.229914795850654"
plot(c(log(true_ref_case+1)), c(log(islet_ref_case+1)), # +1 to avoid -Inf
main="ISLET estimated reference panel vs the true reference panel (Case)",
xlab="Log(True Reference)",
ylab="Log(Estimated Reference)",
pch=19,
col="blue")
abline(a=0, b=1, col="red", lwd=2)

true_ref_ctrl <- lambda_list[[126]]
islet_ref_ctrl <- lambda_list_islet[[26]]
overall_correlation <- cor(c(true_ref_ctrl), c(islet_ref_ctrl))
print(paste("Correlation of ctrl:", overall_correlation))
## [1] "Correlation of ctrl: 0.253906857043226"
plot(c(log(true_ref_ctrl+1)), c(log(islet_ref_ctrl+1)),
main="ISLET estimated reference panel vs the true reference panel (Ctrl)",
xlab="Log(True Reference)",
ylab="Log(Estimated Reference)",
pch=19,
col="blue")
abline(a=0, b=1, col="red", lwd=2)

(3) Test cell-type-specific differential expression (csDE)
genes
i. Intercept test
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
#age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrep(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
## First couple of elements from cases and controls:
## 1 2 3 4 5 6
## FGR 1734 7102 13741 929 16142 6666
## CFH 7 198 24 100 13 11
## FUCA2 91 41 4895 169 3025 181
## 126 127 128 129 130 131
## FGR 1169 10682 8723 6347 9193 5170
## CFH 2 71 6 107 642 6
## FUCA2 67 136 609 166 354 299
## Design matrices hidded.
## Total cell type number:
## [1] 6
## Cell type categories:
## [1] "B-cells" "CD4" "CD8" "NK" "Neutrophils"
## [6] "Monocytes"
## Total sample number and subject number:
## [1] 250 50
## Total case number and ctrl number:
## [1] 25 25
## First several subject ID for the samples:
## [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope):
## [1] "intercept"
execution_time <- system.time({
N25_age.test <- isletTest(input = N25_input)
})
print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_intercept_test_0_5.rds")
We used BH procedure to control FDR in multiple testing, and then
reported csDEG at pValue < 0.05.
N25_age.test_0.5 <- readRDS("N25_intercept_test_0_5.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_0.5))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")
csDEG <- p_values_long %>%
filter(PValue < 0.05)
top_csDEG <- csDEG %>%
group_by(CellType) %>%
arrange(CellType, PValue) %>%
ungroup()
gene_list_by_celltype <- top_csDEG %>%
split(.$CellType) %>%
lapply(function(df) df$Gene)
islet_gene_list_by_celltype <- list()
for (cell_type in names(gene_list_by_celltype)) {
factor_vector <- gene_list_by_celltype[[cell_type]]
gene_names <- as.character(factor_vector)
islet_gene_list_by_celltype[[cell_type]] <- gene_names
}
true_gene_list_by_celltype <- list()
for (cell_type in names(DEG_list)) {
#type1_indices <- DEG_list[[cell_type]]$Type1
#type3_indices <- DEG_list[[cell_type]]$Type3
all_indices <- DEG_list[[cell_type]]$Type3
genes <- gene_list[all_indices]
true_gene_list_by_celltype[[cell_type]] <- genes
}
num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type
accuracy_list_0.5 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_0.5)) {
predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
#type1_indices <- true_gene_list_by_celltype[[cell_type]]
type3_indices <- true_gene_list_by_celltype[[cell_type]]
# Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
accuracy_list_0.5[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
Type3_accuracy = Type3_accuracy)
}
print(accuracy_list_0.5)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.052
##
##
## $CD4
## $CD4$Type3_accuracy
## [1] 0.048
##
##
## $CD8
## $CD8$Type3_accuracy
## [1] 0.104
##
##
## $NK
## $NK$Type3_accuracy
## [1] 0.072
##
##
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.06
##
##
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.204
accuracy_list_0.5_intercept <- accuracy_list_0.5
We use ROC curve to illustrate the result.
K = length(cellType_list)
G = length(gene_list)
flag = matrix(0, nrow = G, ncol = K)
rownames(flag) = gene_list
colnames(flag) = cellType_list
for (k in 1:K) {
type3_indices = DEG_list[[cellType_list[k]]]$Type3
flag[type3_indices, k] = 1
}
ROC.DE <- function(DE.gs, pval) {
pred = prediction(1-pval, DE.gs)
perf = performance(pred,"tpr","fpr")
perf
}
roc = ROC.DE(flag, N25_age.test_0.5)
# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")
# 初始化空数据框
roc_data <- data.frame()
# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
# 提取当前cellType的fpr和tpr
fpr <- fpr_list[[k]]
tpr <- tpr_list[[k]]
# 生成临时数据框并添加到总数据框中
temp_data <- data.frame(
FPR = fpr,
TPR = tpr,
CellType = cellType_list[k]
)
roc_data <- rbind(roc_data, temp_data)
}
# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
geom_line(size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
labs(title = "ROC Curve for DE Gene Detection (LFC = 0.5)",
x = "False Positive Rate (1-Specificity)",
y = "True Positive Rate (Sensitivity)") +
theme_minimal() +
theme(legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
plot.title = element_text(hjust = 0.5, size = 14)) +
scale_color_brewer(palette = "Set1")

ii. Slope test
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrepSlope(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
## First couple of elements from cases and controls:
## 1 2 3 4 5 6
## FGR 1734 7102 13741 929 16142 6666
## CFH 7 198 24 100 13 11
## FUCA2 91 41 4895 169 3025 181
## 126 127 128 129 130 131
## FGR 1169 10682 8723 6347 9193 5170
## CFH 2 71 6 107 642 6
## FUCA2 67 136 609 166 354 299
## Design matrices hidded.
## Total cell type number:
## [1] 6
## Cell type categories:
## [1] "B-cells" "CD4" "CD8" "NK" "Neutrophils"
## [6] "Monocytes"
## Total sample number and subject number:
## [1] 250 50
## Total case number and ctrl number:
## [1] 25 25
## First several subject ID for the samples:
## [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope):
## [1] "slope"
execution_time <- system.time({
N25_age.test <- isletTest(input = N25_input)
})
print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_slope_test_0_5.rds")
We used BH procedure to control FDR in multiple testing, and then
reported csDEG at pValue < 0.05.
N25_age.test_0.5 <- readRDS("N25_slope_test_0_5.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_0.5))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")
csDEG <- p_values_long %>%
filter(PValue < 0.05)
top_csDEG <- csDEG %>%
group_by(CellType) %>%
arrange(CellType, PValue) %>%
ungroup()
gene_list_by_celltype <- top_csDEG %>%
split(.$CellType) %>%
lapply(function(df) df$Gene)
islet_gene_list_by_celltype <- list()
for (cell_type in names(gene_list_by_celltype)) {
factor_vector <- gene_list_by_celltype[[cell_type]]
gene_names <- as.character(factor_vector)
islet_gene_list_by_celltype[[cell_type]] <- gene_names
}
true_gene_list_by_celltype <- list()
for (cell_type in names(DEG_list)) {
#type1_indices <- DEG_list[[cell_type]]$Type1
#type3_indices <- DEG_list[[cell_type]]$Type3
all_indices <- DEG_list[[cell_type]]$Type3
genes <- gene_list[all_indices]
true_gene_list_by_celltype[[cell_type]] <- genes
}
num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type
accuracy_list_0.5 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_0.5)) {
predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
#type1_indices <- true_gene_list_by_celltype[[cell_type]]
type3_indices <- true_gene_list_by_celltype[[cell_type]]
# Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
accuracy_list_0.5[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
Type3_accuracy = Type3_accuracy)
}
print(accuracy_list_0.5)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.08
##
##
## $CD4
## $CD4$Type3_accuracy
## [1] 0.084
##
##
## $CD8
## $CD8$Type3_accuracy
## [1] 0.076
##
##
## $NK
## $NK$Type3_accuracy
## [1] 0.116
##
##
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.12
##
##
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.136
accuracy_list_0.5_slope <- accuracy_list_0.5
We use ROC curve to illustrate the result.
roc = ROC.DE(flag, N25_age.test_0.5)
# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")
# 初始化空数据框
roc_data <- data.frame()
# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
# 提取当前cellType的fpr和tpr
fpr <- fpr_list[[k]]
tpr <- tpr_list[[k]]
# 生成临时数据框并添加到总数据框中
temp_data <- data.frame(
FPR = fpr,
TPR = tpr,
CellType = cellType_list[k]
)
roc_data <- rbind(roc_data, temp_data)
}
# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
geom_line(size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
labs(title = "ROC Curve for DE Gene Detection (LFC = 0.5)",
x = "False Positive Rate (1-Specificity)",
y = "True Positive Rate (Sensitivity)") +
theme_minimal() +
theme(legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
plot.title = element_text(hjust = 0.5, size = 14)) +
scale_color_brewer(palette = "Set1")

7. \(\Delta_0\) = 1.0
ref = Gen_ref(J=50, T=5, mu_m = mu_m, sigma_m = sigma_m,
gene_ls = gene_list, cell_ls = cellType_list, LFC=1.0,
k_mu = 0.01, k_delta = 0.1)
meta_data <- ref$meta_data
lambda_list <- ref$lambda_ls
M_list <- ref$M_ls
Phi_list <- ref$Phi_ls
DEG_list <- ref$DEG_ls
head(meta_data)
## sample_ID group subject_ID time_point
## 1 1 case 1 1
## 2 2 case 1 2
## 3 3 case 1 3
## 4 4 case 1 4
## 5 5 case 1 5
## 6 6 case 2 1
Y <- Gen_mix(lambda_ls = lambda_list, theta = theta)
theta_df = as.data.frame(t(theta))
(1) Heatmap of \(M\) and \(\lambda\)
i. Mean expression \(M\)
heatmap_grobs <- lapply(c(1:5, 126:130), function(i) {
M <- M_list[[i]]
if (i %in% 1:5) {
create_heatmap_grob(M, paste("subject 1 (case) at time point ", i), color_palette) }
else {create_heatmap_grob(M, paste("subject 26 (ctrl) at time point ", i-125), color_palette)}
})
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 第i个热图
heatmap_grobs[[i + 5]], # 第i+5个热图
nrow = 1 # 一行两列
)
}





We examine whether DE genes we set are differentially expressed in
\(M\) by looking at the corresponding
index in case and control.
heatmap_grobs <- lapply(c(1:5), function(i) {
M_case <- M_list[[i]]
M_ctrl <- M_list[[i + 125]]
# 创建子集矩阵,只保留每列的250个元素
M_sub_case <- do.call(cbind, lapply(1:ncol(M_case), function(j) {
M_case[((j-1)*250 + 1):(j*250), j]
}))
M_sub_ctrl <- do.call(cbind, lapply(1:ncol(M_ctrl), function(j) {
M_ctrl[((j-1)*250 + 1):(j*250), j]
}))
# 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
M_combined <- cbind(M_sub_case, M_sub_ctrl)
# 创建热图 grob
create_heatmap_grob(M_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})
# 绘制热图
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 每个时间点的热图
nrow = 1 # 一行一列
)
}





ii. Ture reference panel \(\lambda\)
heatmap_grobs <- lapply(c(1:5), function(i) {
lambda_case <- lambda_list[[i]]
lambda_ctrl <- lambda_list[[i + 125]]
# 创建子集矩阵,只保留每列的250个元素
lambda_sub_case <- do.call(cbind, lapply(1:ncol(lambda_case), function(j) {
lambda_case[((j-1)*250 + 1):(j*250), j]
}))
lambda_sub_ctrl <- do.call(cbind, lapply(1:ncol(lambda_ctrl), function(j) {
lambda_ctrl[((j-1)*250 + 1):(j*250), j]
}))
# 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
lambda_combined <- cbind(lambda_sub_case, lambda_sub_ctrl)
# 创建热图 grob
create_heatmap_grob(lambda_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})
# 绘制热图
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 每个时间点的热图
nrow = 1 # 一行一列
)
}





(2) Deconvolve individual-specific reference panel
We deconvolve the individual-specific reference panel with
isletSolve() and compare the deconvolved reference panel
with true reference panel.
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
#age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrep(dat_se = N25_se)
N25_input
execution_time <- system.time({
N25_age.ref <- isletSolve(input=N25_input)
})
print(execution_time)
saveRDS(N25_age.ref, file="N25_ref_1_0.rds")
N25_age.ref <- readRDS("N25_ref_1_0.rds")
caseVal <- caseEst(N25_age.ref)
ctrlVal <- ctrlEst(N25_age.ref)
lambda_list_islet <- list()
for (i in 1:50) {
if (i %in% seq(1,25)) {
Bcell = caseVal[["B-cells"]][,i]
CD4 = caseVal[["CD4"]][,i]
CD8 = caseVal[["CD8"]][,i]
NK = caseVal[["NK"]][,i]
Neutrophils = caseVal[["Neutrophils"]][,i]
Monocytes = caseVal[["Monocytes"]][,i]
lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
colnames(lambda) = c("B-cells","CD4", "CD8", "NK", "Neutrophils", "Monocytes")
lambda_list_islet[[as.character(i)]] <- lambda
}
else {
Bcell = ctrlVal[["B-cells"]][,i-25]
CD4 = ctrlVal[["CD4"]][,i-25]
CD8 = ctrlVal[["CD8"]][,i-25]
NK = ctrlVal[["NK"]][,i-25]
Neutrophils = ctrlVal[["Neutrophils"]][,i-25]
Monocytes = ctrlVal[["Monocytes"]][,i-25]
lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
colnames(lambda) = c("B-cells","CD4", "CD8", "NK", "Neutrophils", "Monocytes")
lambda_list_islet[[as.character(i)]] <- lambda
}
}
true_ref_case <- lambda_list_islet[[1]][1:50, ]
islet_ref_case <- lambda_list[[1]][1:50, ]
ref_case_combined <- cbind(true_ref_case, islet_ref_case)
pheatmap(ref_case_combined,
cluster_rows = FALSE, # 关闭行聚类
cluster_cols = FALSE, # 关闭列聚类
show_rownames = FALSE, # 不显示行名
show_colnames = TRUE,
main = "True Reference Panel vs Deconvolved Reference Panel (Case)") # 显示列名

true_ref_ctrl <- lambda_list[[126]][1:50, ]
islet_ref_ctrl <- lambda_list_islet[[26]][1:50, ]
ref_ctrl_combined <- cbind(true_ref_ctrl, islet_ref_ctrl)
pheatmap(ref_ctrl_combined,
cluster_rows = FALSE, # 关闭行聚类
cluster_cols = FALSE, # 关闭列聚类
show_rownames = FALSE, # 不显示行名
show_colnames = TRUE,
main = "True Reference Panel vs Deconvolved Reference Panel (Ctrl)") # 显示列名

true_ref_case <- lambda_list[[1]]
islet_ref_case <- lambda_list_islet[[1]]
overall_correlation <- cor(c(true_ref_case), c(islet_ref_case))
print(paste("Correlation of case:", overall_correlation))
## [1] "Correlation of case: 0.251721318589678"
plot(c(log(true_ref_case+1)), c(log(islet_ref_case+1)),
main="ISLET estimated reference panel vs the true reference panel (Case)",
xlab="Log(True Reference)",
ylab="Log(Estimated Reference)",
pch=19,
col="blue")
abline(a=0, b=1, col="red", lwd=2)

true_ref_ctrl <- lambda_list[[126]]
islet_ref_ctrl <- lambda_list_islet[[26]]
overall_correlation <- cor(c(true_ref_ctrl), c(islet_ref_ctrl))
print(paste("Correlation of ctrl:", overall_correlation))
## [1] "Correlation of ctrl: 0.277678122962485"
plot(c(log(true_ref_ctrl+1)), c(log(islet_ref_ctrl+1)),
main="ISLET estimated reference panel vs the true reference panel (Ctrl)",
xlab="Log(True Reference)",
ylab="Log(Estimated Reference)",
pch=19,
col="blue")
abline(a=0, b=1, col="red", lwd=2)

(3) Test cell-type-specific differential expression (csDE)
genes
i. Intercept test
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
#age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrep(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
## First couple of elements from cases and controls:
## 1 2 3 4 5 6
## FGR 1885 10212 15038 1384 16607 6720
## CFH 10 315 44 191 25 22
## FUCA2 176 57 7455 282 3578 316
## 126 127 128 129 130 131
## FGR 1835 12316 10007 6748 10296 6258
## CFH 6 137 14 138 1011 13
## FUCA2 101 222 1086 309 499 474
## Design matrices hidded.
## Total cell type number:
## [1] 6
## Cell type categories:
## [1] "B-cells" "CD4" "CD8" "NK" "Neutrophils"
## [6] "Monocytes"
## Total sample number and subject number:
## [1] 250 50
## Total case number and ctrl number:
## [1] 25 25
## First several subject ID for the samples:
## [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope):
## [1] "intercept"
execution_time <- system.time({
N25_age.test <- isletTest(input = N25_input)
})
print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_intercept_test_1_0.rds")
We used BH procedure to control FDR in multiple testing, and then
reported csDEG at pValue < 0.05.
N25_age.test_1.0 <- readRDS("N25_intercept_test_1_0.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_1.0))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")
csDEG <- p_values_long %>%
filter(PValue < 0.05)
top_csDEG <- csDEG %>%
group_by(CellType) %>%
arrange(CellType, PValue) %>%
ungroup()
gene_list_by_celltype <- top_csDEG %>%
split(.$CellType) %>%
lapply(function(df) df$Gene)
islet_gene_list_by_celltype <- list()
for (cell_type in names(gene_list_by_celltype)) {
factor_vector <- gene_list_by_celltype[[cell_type]]
gene_names <- as.character(factor_vector)
islet_gene_list_by_celltype[[cell_type]] <- gene_names
}
true_gene_list_by_celltype <- list()
for (cell_type in names(DEG_list)) {
#type1_indices <- DEG_list[[cell_type]]$Type1
#type3_indices <- DEG_list[[cell_type]]$Type3
all_indices <- DEG_list[[cell_type]]$Type3
genes <- gene_list[all_indices]
true_gene_list_by_celltype[[cell_type]] <- genes
}
num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type
accuracy_list_1.0 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_1.0)) {
predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
#type1_indices <- true_gene_list_by_celltype[[cell_type]]
type3_indices <- true_gene_list_by_celltype[[cell_type]]
#Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
accuracy_list_1.0[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
Type3_accuracy = Type3_accuracy)
}
print(accuracy_list_1.0)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.056
##
##
## $CD4
## $CD4$Type3_accuracy
## [1] 0.048
##
##
## $CD8
## $CD8$Type3_accuracy
## [1] 0.108
##
##
## $NK
## $NK$Type3_accuracy
## [1] 0.064
##
##
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.056
##
##
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.228
accuracy_list_1.0_intercept <- accuracy_list_1.0
roc = ROC.DE(flag, N25_age.test_1.0)
# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")
# 初始化空数据框
roc_data <- data.frame()
# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
# 提取当前cellType的fpr和tpr
fpr <- fpr_list[[k]]
tpr <- tpr_list[[k]]
# 生成临时数据框并添加到总数据框中
temp_data <- data.frame(
FPR = fpr,
TPR = tpr,
CellType = cellType_list[k]
)
roc_data <- rbind(roc_data, temp_data)
}
# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
geom_line(size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
labs(title = "ROC Curve for DE Gene Detection (LFC = 1.0)",
x = "False Positive Rate (1-Specificity)",
y = "True Positive Rate (Sensitivity)") +
theme_minimal() +
theme(legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
plot.title = element_text(hjust = 0.5, size = 14)) +
scale_color_brewer(palette = "Set1")

ii. Slope test
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrepSlope(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
## First couple of elements from cases and controls:
## 1 2 3 4 5 6
## FGR 1885 10212 15038 1384 16607 6720
## CFH 10 315 44 191 25 22
## FUCA2 176 57 7455 282 3578 316
## 126 127 128 129 130 131
## FGR 1835 12316 10007 6748 10296 6258
## CFH 6 137 14 138 1011 13
## FUCA2 101 222 1086 309 499 474
## Design matrices hidded.
## Total cell type number:
## [1] 6
## Cell type categories:
## [1] "B-cells" "CD4" "CD8" "NK" "Neutrophils"
## [6] "Monocytes"
## Total sample number and subject number:
## [1] 250 50
## Total case number and ctrl number:
## [1] 25 25
## First several subject ID for the samples:
## [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope):
## [1] "slope"
execution_time <- system.time({
N25_age.test <- isletTest(input = N25_input)
})
print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_slope_test_1_0.rds")
We used BH procedure to control FDR in multiple testing, and then
reported csDEG at pValue < 0.05.
N25_age.test_1.0 <- readRDS("N25_slope_test_1_0.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_1.0))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")
csDEG <- p_values_long %>%
filter(PValue < 0.05)
top_csDEG <- csDEG %>%
group_by(CellType) %>%
arrange(CellType, PValue) %>%
ungroup()
gene_list_by_celltype <- top_csDEG %>%
split(.$CellType) %>%
lapply(function(df) df$Gene)
islet_gene_list_by_celltype <- list()
for (cell_type in names(gene_list_by_celltype)) {
factor_vector <- gene_list_by_celltype[[cell_type]]
gene_names <- as.character(factor_vector)
islet_gene_list_by_celltype[[cell_type]] <- gene_names
}
true_gene_list_by_celltype <- list()
for (cell_type in names(DEG_list)) {
#type1_indices <- DEG_list[[cell_type]]$Type1
#type3_indices <- DEG_list[[cell_type]]$Type3
all_indices <- DEG_list[[cell_type]]$Type3
genes <- gene_list[all_indices]
true_gene_list_by_celltype[[cell_type]] <- genes
}
num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type
accuracy_list_1.0 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_1.0)) {
predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
#type1_indices <- true_gene_list_by_celltype[[cell_type]]
type3_indices <- true_gene_list_by_celltype[[cell_type]]
#Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
accuracy_list_1.0[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
Type3_accuracy = Type3_accuracy)
}
print(accuracy_list_1.0)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.084
##
##
## $CD4
## $CD4$Type3_accuracy
## [1] 0.072
##
##
## $CD8
## $CD8$Type3_accuracy
## [1] 0.092
##
##
## $NK
## $NK$Type3_accuracy
## [1] 0.108
##
##
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.084
##
##
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.14
accuracy_list_1.0_slope <- accuracy_list_1.0
roc = ROC.DE(flag, N25_age.test_1.0)
# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")
# 初始化空数据框
roc_data <- data.frame()
# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
# 提取当前cellType的fpr和tpr
fpr <- fpr_list[[k]]
tpr <- tpr_list[[k]]
# 生成临时数据框并添加到总数据框中
temp_data <- data.frame(
FPR = fpr,
TPR = tpr,
CellType = cellType_list[k]
)
roc_data <- rbind(roc_data, temp_data)
}
# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
geom_line(size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
labs(title = "ROC Curve for DE Gene Detection (LFC = 1.0)",
x = "False Positive Rate (1-Specificity)",
y = "True Positive Rate (Sensitivity)") +
theme_minimal() +
theme(legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
plot.title = element_text(hjust = 0.5, size = 14)) +
scale_color_brewer(palette = "Set1")

8. \(\Delta_0\) = 1.5
ref = Gen_ref(J=50, T=5, mu_m = mu_m, sigma_m = sigma_m,
gene_ls = gene_list, cell_ls = cellType_list, LFC=1.5,
k_mu = 0.01, k_delta = 0.1)
meta_data <- ref$meta_data
lambda_list <- ref$lambda_ls
M_list <- ref$M_ls
Phi_list <- ref$Phi_ls
DEG_list <- ref$DEG_ls
head(meta_data)
## sample_ID group subject_ID time_point
## 1 1 case 1 1
## 2 2 case 1 2
## 3 3 case 1 3
## 4 4 case 1 4
## 5 5 case 1 5
## 6 6 case 2 1
Y <- Gen_mix(lambda_ls = lambda_list, theta = theta)
theta_df = as.data.frame(t(theta))
(1) Heatmap of \(M\) and \(\lambda\)
i. Mean expression \(M\)
heatmap_grobs <- lapply(c(1:5, 126:130), function(i) {
M <- M_list[[i]]
if (i %in% 1:5) {
create_heatmap_grob(M, paste("subject 1 (case) at time point ", i), color_palette) }
else {create_heatmap_grob(M, paste("subject 26 (ctrl) at time point ", i-125), color_palette)}
})
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 第i个热图
heatmap_grobs[[i + 5]], # 第i+5个热图
nrow = 1 # 一行两列
)
}





We examine whether DE genes we set are differentially expressed in
\(M\) by looking at the corresponding
index in case and control.
heatmap_grobs <- lapply(c(1:5), function(i) {
M_case <- M_list[[i]]
M_ctrl <- M_list[[i + 125]]
# 创建子集矩阵,只保留每列的250个元素
M_sub_case <- do.call(cbind, lapply(1:ncol(M_case), function(j) {
M_case[((j-1)*250 + 1):(j*250), j]
}))
M_sub_ctrl <- do.call(cbind, lapply(1:ncol(M_ctrl), function(j) {
M_ctrl[((j-1)*250 + 1):(j*250), j]
}))
# 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
M_combined <- cbind(M_sub_case, M_sub_ctrl)
# 创建热图 grob
create_heatmap_grob(M_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})
# 绘制热图
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 每个时间点的热图
nrow = 1 # 一行一列
)
}





ii. Ture reference panel \(\lambda\)
heatmap_grobs <- lapply(c(1:5), function(i) {
lambda_case <- lambda_list[[i]]
lambda_ctrl <- lambda_list[[i + 125]]
# 创建子集矩阵,只保留每列的250个元素
lambda_sub_case <- do.call(cbind, lapply(1:ncol(lambda_case), function(j) {
lambda_case[((j-1)*250 + 1):(j*250), j]
}))
lambda_sub_ctrl <- do.call(cbind, lapply(1:ncol(lambda_ctrl), function(j) {
lambda_ctrl[((j-1)*250 + 1):(j*250), j]
}))
# 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
lambda_combined <- cbind(lambda_sub_case, lambda_sub_ctrl)
# 创建热图 grob
create_heatmap_grob(lambda_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})
# 绘制热图
for (i in 1:5) {
grid.arrange(
heatmap_grobs[[i]], # 每个时间点的热图
nrow = 1 # 一行一列
)
}





(2) Deconvolve individual-specific reference panel
We deconvolve the individual-specific reference panel with
isletSolve() and compare the deconvolved reference panel
with true reference panel.
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
#age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrep(dat_se = N25_se)
N25_input
execution_time <- system.time({
N25_age.ref <- isletSolve(input=N25_input)
})
print(execution_time)
saveRDS(N25_age.ref, file="N25_ref_1_5.rds")
N25_age.ref <- readRDS("N25_ref_1_5.rds")
caseVal <- caseEst(N25_age.ref)
ctrlVal <- ctrlEst(N25_age.ref)
lambda_list_islet <- list()
for (i in 1:50) {
if (i %in% seq(1,25)) {
Bcell = caseVal[["B-cells"]][,i]
CD4 = caseVal[["CD4"]][,i]
CD8 = caseVal[["CD8"]][,i]
NK = caseVal[["NK"]][,i]
Neutrophils = caseVal[["Neutrophils"]][,i]
Monocytes = caseVal[["Monocytes"]][,i]
lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
colnames(lambda) = c("B-cells","CD4", "CD8", "NK", "Neutrophils", "Monocytes")
lambda_list_islet[[as.character(i)]] <- lambda
}
else {
Bcell = ctrlVal[["B-cells"]][,i-25]
CD4 = ctrlVal[["CD4"]][,i-25]
CD8 = ctrlVal[["CD8"]][,i-25]
NK = ctrlVal[["NK"]][,i-25]
Neutrophils = ctrlVal[["Neutrophils"]][,i-25]
Monocytes = ctrlVal[["Monocytes"]][,i-25]
lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
colnames(lambda) = c("B-cells","CD4", "CD8", "NK", "Neutrophils", "Monocytes")
lambda_list_islet[[as.character(i)]] <- lambda
}
}
true_ref_case <- lambda_list_islet[[1]][1:50, ]
islet_ref_case <- lambda_list[[1]][1:50, ]
ref_case_combined <- cbind(true_ref_case, islet_ref_case)
pheatmap(ref_case_combined,
cluster_rows = FALSE, # 关闭行聚类
cluster_cols = FALSE, # 关闭列聚类
show_rownames = FALSE, # 不显示行名
show_colnames = TRUE,
main = "True Reference Panel vs Deconvolved Reference Panel (Case)") # 显示列名

true_ref_ctrl <- lambda_list[[126]][1:50, ]
islet_ref_ctrl <- lambda_list_islet[[26]][1:50, ]
ref_ctrl_combined <- cbind(true_ref_ctrl, islet_ref_ctrl)
pheatmap(ref_ctrl_combined,
cluster_rows = FALSE, # 关闭行聚类
cluster_cols = FALSE, # 关闭列聚类
show_rownames = FALSE, # 不显示行名
show_colnames = TRUE,
main = "True Reference Panel vs Deconvolved Reference Panel (Ctrl)") # 显示列名

true_ref_case <- lambda_list[[1]]
islet_ref_case <- lambda_list_islet[[1]]
overall_correlation <- cor(c(true_ref_case), c(islet_ref_case))
print(paste("Correlation of case:", overall_correlation))
## [1] "Correlation of case: 0.281991831863852"
plot(c(log(true_ref_case+1)), c(log(islet_ref_case+1)),
main="ISLET estimated reference panel vs the true reference panel (Case)",
xlab="Log(True Reference)",
ylab="Log(Estimated Reference)",
pch=19,
col="blue")
abline(a=0, b=1, col="red", lwd=2)

true_ref_ctrl <- lambda_list[[126]]
islet_ref_ctrl <- lambda_list_islet[[26]]
overall_correlation <- cor(c(true_ref_ctrl), c(islet_ref_ctrl))
print(paste("Correlation of ctrl:", overall_correlation))
## [1] "Correlation of ctrl: 0.308103086042903"
plot(c(log(true_ref_ctrl+1)), c(log(islet_ref_ctrl+1)),
main="ISLET estimated reference panel vs the true reference panel (Ctrl)",
xlab="Log(True Reference)",
ylab="Log(Estimated Reference)",
pch=19,
col="blue")
abline(a=0, b=1, col="red", lwd=2)

(3) Test cell-type-specific differential expression (csDE)
genes
i. Intercept test
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
#age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrep(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
## First couple of elements from cases and controls:
## 1 2 3 4 5 6
## FGR 2133 15317 17342 1744 16880 7120
## CFH 18 519 58 332 37 47
## FUCA2 276 94 7133 501 4472 455
## 126 127 128 129 130 131
## FGR 3027 15125 12162 7453 12052 6922
## CFH 9 203 25 276 1764 22
## FUCA2 160 374 1745 522 786 756
## Design matrices hidded.
## Total cell type number:
## [1] 6
## Cell type categories:
## [1] "B-cells" "CD4" "CD8" "NK" "Neutrophils"
## [6] "Monocytes"
## Total sample number and subject number:
## [1] 250 50
## Total case number and ctrl number:
## [1] 25 25
## First several subject ID for the samples:
## [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope):
## [1] "intercept"
execution_time <- system.time({
N25_age.test <- isletTest(input = N25_input)
})
print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_intercept_test_1_5.rds")
We used BH procedure to control FDR in multiple testing, and then
reported csDEG at pValue < 0.05.
N25_age.test_1.5 <- readRDS("N25_intercept_test_1_5.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_1.5))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")
csDEG <- p_values_long %>%
filter(PValue < 0.05)
top_csDEG <- csDEG %>%
group_by(CellType) %>%
arrange(CellType, PValue) %>%
ungroup()
gene_list_by_celltype <- top_csDEG %>%
split(.$CellType) %>%
lapply(function(df) df$Gene)
islet_gene_list_by_celltype <- list()
for (cell_type in names(gene_list_by_celltype)) {
factor_vector <- gene_list_by_celltype[[cell_type]]
gene_names <- as.character(factor_vector)
islet_gene_list_by_celltype[[cell_type]] <- gene_names
}
true_gene_list_by_celltype <- list()
for (cell_type in names(DEG_list)) {
#type1_indices <- DEG_list[[cell_type]]$Type1
#type3_indices <- DEG_list[[cell_type]]$Type3
all_indices <- DEG_list[[cell_type]]$Type3
genes <- gene_list[all_indices]
true_gene_list_by_celltype[[cell_type]] <- genes
}
num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type
accuracy_list_1.5 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_1.5)) {
predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
#type1_indices <- true_gene_list_by_celltype[[cell_type]]
type3_indices <- true_gene_list_by_celltype[[cell_type]]
#Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
accuracy_list_1.5[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
Type3_accuracy = Type3_accuracy)
}
print(accuracy_list_1.5)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.06
##
##
## $CD4
## $CD4$Type3_accuracy
## [1] 0.052
##
##
## $CD8
## $CD8$Type3_accuracy
## [1] 0.12
##
##
## $NK
## $NK$Type3_accuracy
## [1] 0.08
##
##
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.064
##
##
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.264
accuracy_list_1.5_intercept <- accuracy_list_1.5
roc = ROC.DE(flag, N25_age.test_1.5)
# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")
# 初始化空数据框
roc_data <- data.frame()
# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
# 提取当前cellType的fpr和tpr
fpr <- fpr_list[[k]]
tpr <- tpr_list[[k]]
# 生成临时数据框并添加到总数据框中
temp_data <- data.frame(
FPR = fpr,
TPR = tpr,
CellType = cellType_list[k]
)
roc_data <- rbind(roc_data, temp_data)
}
# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
geom_line(size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
labs(title = "ROC Curve for DE Gene Detection (LFC = 1.5)",
x = "False Positive Rate (1-Specificity)",
y = "True Positive Rate (Sensitivity)") +
theme_minimal() +
theme(legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
plot.title = element_text(hjust = 0.5, size = 14)) +
scale_color_brewer(palette = "Set1")

ii. Slope test
sample_info <- data.frame(group = meta_data$group,
subject_ID = meta_data$subject_ID,
age = meta_data$time_point,
'B-cells' = theta_df$`B-cells`,
'CD4' = theta_df$CD4,
'CD8' = theta_df$CD8,
'NK' = theta_df$NK,
'Neutrophils' = theta_df$Neutrophils,
'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"
N25_se <- SummarizedExperiment(
assays = list(counts = as.data.frame(Y)),
colData = sample_info)
N25_input <- dataPrepSlope(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
## First couple of elements from cases and controls:
## 1 2 3 4 5 6
## FGR 2133 15317 17342 1744 16880 7120
## CFH 18 519 58 332 37 47
## FUCA2 276 94 7133 501 4472 455
## 126 127 128 129 130 131
## FGR 3027 15125 12162 7453 12052 6922
## CFH 9 203 25 276 1764 22
## FUCA2 160 374 1745 522 786 756
## Design matrices hidded.
## Total cell type number:
## [1] 6
## Cell type categories:
## [1] "B-cells" "CD4" "CD8" "NK" "Neutrophils"
## [6] "Monocytes"
## Total sample number and subject number:
## [1] 250 50
## Total case number and ctrl number:
## [1] 25 25
## First several subject ID for the samples:
## [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope):
## [1] "slope"
execution_time <- system.time({
N25_age.test <- isletTest(input = N25_input)
})
print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_slope_test_1_5.rds")
We used BH procedure to control FDR in multiple testing, and then
reported csDEG at pValue < 0.05.
N25_age.test_1.5 <- readRDS("N25_slope_test_1_5.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_1.5))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")
csDEG <- p_values_long %>%
filter(PValue < 0.05)
top_csDEG <- csDEG %>%
group_by(CellType) %>%
arrange(CellType, PValue) %>%
ungroup()
gene_list_by_celltype <- top_csDEG %>%
split(.$CellType) %>%
lapply(function(df) df$Gene)
islet_gene_list_by_celltype <- list()
for (cell_type in names(gene_list_by_celltype)) {
factor_vector <- gene_list_by_celltype[[cell_type]]
gene_names <- as.character(factor_vector)
islet_gene_list_by_celltype[[cell_type]] <- gene_names
}
true_gene_list_by_celltype <- list()
for (cell_type in names(DEG_list)) {
#type1_indices <- DEG_list[[cell_type]]$Type1
#type3_indices <- DEG_list[[cell_type]]$Type3
all_indices <- DEG_list[[cell_type]]$Type3
genes <- gene_list[all_indices]
true_gene_list_by_celltype[[cell_type]] <- genes
}
num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type
accuracy_list_1.5 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_1.5)) {
predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
#type1_indices <- true_gene_list_by_celltype[[cell_type]]
type3_indices <- true_gene_list_by_celltype[[cell_type]]
#Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
accuracy_list_1.5[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
Type3_accuracy = Type3_accuracy)
}
print(accuracy_list_1.5)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.084
##
##
## $CD4
## $CD4$Type3_accuracy
## [1] 0.072
##
##
## $CD8
## $CD8$Type3_accuracy
## [1] 0.092
##
##
## $NK
## $NK$Type3_accuracy
## [1] 0.1
##
##
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.06
##
##
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.136
accuracy_list_1.5_slope <- accuracy_list_1.5
roc = ROC.DE(flag, N25_age.test_1.5)
# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")
# 初始化空数据框
roc_data <- data.frame()
# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
# 提取当前cellType的fpr和tpr
fpr <- fpr_list[[k]]
tpr <- tpr_list[[k]]
# 生成临时数据框并添加到总数据框中
temp_data <- data.frame(
FPR = fpr,
TPR = tpr,
CellType = cellType_list[k]
)
roc_data <- rbind(roc_data, temp_data)
}
# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
geom_line(size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
labs(title = "ROC Curve for DE Gene Detection (LFC = 1.5)",
x = "False Positive Rate (1-Specificity)",
y = "True Positive Rate (Sensitivity)") +
theme_minimal() +
theme(legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
plot.title = element_text(hjust = 0.5, size = 14)) +
scale_color_brewer(palette = "Set1")
